This notebook is arranged in the order of the presentation slides and is intended to be read along side it.
It provides code used to generate findings and plots. It also goes further than what is presented, with additional plots and stats.
A couple of the plots in the presentation were generated using the data here but in Tableau. The datasets are provided here.
If you have any questions - feel free to reach out to Ronen (ronenbecker@gmail.com)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(lubridate)
library(RColorBrewer)
library(reshape2)
library(tidyr)
library(ggcorrplot)
library(repr)
library(treemap)
Datathon <- read.csv("~/Google Drive/Data Science/Datathon for Social Good/datathon-social-good-master/Datathon data/Internal data track/Datathon.csv")
# How we analysed it slide
# ----------------------------- # Hub Perspective :
## Convert to date format
Datathon$ActivityDate <- as.character(Datathon$ActivityDate)
Datathon$ActivityDate<- dmy(Datathon$ActivityDate)
#hub category aggregate Table
hub_cat_agg <- Datathon %>% group_by(HubRandomID) %>%
summarise(PA = sum(CategoryName == "Programmed Activities"),
EAT = sum(CategoryName == "Education and Training"),
OOE = sum(CategoryName == "One Off events"),
SR = sum(CategoryName == "Service Referrals"),
Min_DT = min(ActivityDate),
Max_DT = max(ActivityDate),
Duration = Max_DT - Min_DT,
duration_mnths = as.numeric((Duration/365))*12
)
#hub program (short name) aggregate Table
hub_prog_agg <- Datathon %>% group_by(HubRandomID, ShortName ) %>% summarise(cnt = n())
hub_prog_agg_corr <- pivot_wider(hub_prog_agg ,names_from = ShortName, values_from = c(cnt), values_fill = list(cnt = 0))
hub_cat_agg %>% ggplot(aes(x=duration_mnths)) +
xlab("Hub Running Duration in Months") +
ylab("Count") +
ggtitle("Hub Running Duration in Months", subtitle = "Distribution of Hub Durations for all Hubs") +
geom_histogram(fill ="#377EB8", bins = 7, alpha = 0.7) +
theme(
plot.title = element_text(color="#4DAF4A", size=22, face="bold.italic"),
axis.title.x = element_text(color="#E41A1C", size=14, face="bold"),
axis.title.y = element_text(color="#E41A1C", size=14, face="bold"),
axis.text=element_text(size=16)
)
# Load the Data
Datathon <- read.csv("~/Google Drive/Data Science/Datathon for Social Good/datathon-social-good-master/Datathon data/Internal data track/Datathon.csv")
Datathon$ActivityDate <- as.character(Datathon$ActivityDate)
Datathon$ActivityDate<- dmy(Datathon$ActivityDate)
### Get a long table for all SRS for all Hubs
hub_pa_agg_mth <- Datathon %>%
#### Remove all Activities with 0 attendees unless they are Service Referrals:
filter((CategoryName != "Service Referrals" & (ChildParticipants != 0 | AdultParticipants !=0 )) | CategoryName == "Service Referrals" ) %>%
#### Remove all SR with 'Number of Families Participating'
filter(ShortName != "Number of families participating in the Hub") %>%
group_by(HubRandomID, CategoryName ) %>%
summarise(cnt = n())
# get min and max dates for each hub and join back to table
hub_dts <- Datathon %>% group_by(HubRandomID) %>% summarise(min_dt = min(ActivityDate),
max_dt = max(ActivityDate),
duration = max_dt - min_dt)
hub_pa_agg_mth <- hub_pa_agg_mth %>% inner_join(hub_dts, by= c("HubRandomID" = "HubRandomID"))
# Join maxmin dates back to table:
hub_pa_agg_mth <- hub_pa_agg_mth %>% mutate(duration_mnth = as.numeric((duration/365)*12),
avg_cnt_mnth = cnt/ duration_mnth # PAs per month
)
# Create a wide table useful for analysis
hub_pa_agg_mth_wide <- pivot_wider(hub_pa_agg_mth ,names_from = CategoryName, values_from = c(cnt, avg_cnt_mnth), values_fill = list(cnt = 0, avg_cnt_mnth =0))
### Rename columns:
hub_pa_agg_mth_wide <- hub_pa_agg_mth_wide %>% rename(
cnt_EAT = "cnt_Education and Training",
cnt_OOE = "cnt_One Off events",
cnt_PA = "cnt_Programmed Activities",
cnt_SR = "cnt_Service Referrals",
# avg_cnt_mth
avg_cnt_mth_EAT = "avg_cnt_mnth_Education and Training",
avg_cnt_mth_OOE = "avg_cnt_mnth_One Off events",
avg_cnt_mth_PA = "avg_cnt_mnth_Programmed Activities",
avg_cnt_mth_SR = "avg_cnt_mnth_Service Referrals"
) #For renaming tibble column using dplyrpipe
#operator
### Calculate create feature as proportion of month:
hub_pa_agg_mth_wide <- hub_pa_agg_mth_wide %>% mutate( total_avg_cnt_mth = avg_cnt_mth_EAT+avg_cnt_mth_OOE+avg_cnt_mth_PA+avg_cnt_mth_SR,
PA_avg_prop = avg_cnt_mth_PA / total_avg_cnt_mth,
EAT_avg_prop = avg_cnt_mth_EAT/ total_avg_cnt_mth,
OOE_avg_prop = avg_cnt_mth_OOE/ total_avg_cnt_mth,
SR_avg_prop = avg_cnt_mth_SR/ total_avg_cnt_mth,
SR_PA_ratio = avg_cnt_mth_SR/avg_cnt_mth_PA)
hub_pa_agg_mth_wide %>% ungroup() %>% summarise(mean_OOE = mean(avg_cnt_mth_OOE),
mean_PA = mean(avg_cnt_mth_PA),
mean_SR = mean(avg_cnt_mth_SR),
mean_EAT = mean(avg_cnt_mth_EAT)
)
Approach:
Observations:
hub_pa_agg_mth_wide %>% ungroup() %>% summarise(mean_OOE = mean(OOE_avg_prop),
mean_PA = mean(PA_avg_prop),
mean_SR = mean(SR_avg_prop),
mean_EAT = mean(EAT_avg_prop)
)
Approach:
Observations:
hub_cat_agg %>% select("HubRandomID", "Programmed Activities" = "PA","Education and Training" = "EAT","One Off events"= "OOE","Service Referrals" ="SR" ) %>%
pivot_longer(cols = c(-HubRandomID), names_to = "Category") %>% # put all colums other than HubRandomID into Category
ggplot(aes(x= factor(HubRandomID), y = value ,fill = factor(Category) )) +
ggtitle("Hubs by Program Category Types", subtitle = "Category Proportion for Each Hub") +
geom_bar(position = "fill", stat="identity" , alpha = 0.9 ) +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
legend.title =element_blank(),
plot.title = element_text(color="#D95F02", size=22, face="bold.italic"),
axis.title.x = element_text(color="#D53E4F", size=14, face="bold"),
axis.title.y = element_text(color="#D53E4F", size=14, face="bold"),
axis.text=element_text(size=16)
) +
scale_x_discrete(expand = c(0,0)) +
scale_y_continuous(expand = c(0,0)) +
ylab ("Proportion of Activities") +
xlab ("Hubs") +
scale_fill_brewer(palette = "Set1", type = "div", direction = -1)
options(repr.plot.width=30, repr.plot.height=15)
Observations:
hub_pa_agg_mth %>% filter(CategoryName == "Programmed Activities") %>%
ggplot(aes(x=avg_cnt_mnth)) +
xlab("Average PA per Hub per Month") +
ggtitle("Average PA Per Hub per Month", subtitle = "Hub Distribution")+
geom_dotplot(binpositions="all",stackgroups = TRUE,binwidth = 0.7, aes(fill = "light green") )+
theme(legend.position = "none",
axis.text=element_text(size=16)
)
options(repr.plot.width=30, repr.plot.height=6)
hub_pa_agg_mth %>% ungroup() %>% filter(CategoryName == "Programmed Activities") %>%
summarise(min = min(avg_cnt_mnth),
q1 = quantile(avg_cnt_mnth,0.25),
mean = mean(avg_cnt_mnth),
median = median(avg_cnt_mnth),
q3 = quantile(avg_cnt_mnth,0.75),
max = max(avg_cnt_mnth),
sd = sd(avg_cnt_mnth),
IQR = IQR(avg_cnt_mnth),
LowOutlier= (q1 - 1.5*IQR),
HighOutliet = (q3 + 1.5*IQR),
latest_open_dt = max(min_dt),
latest_activity_dt = max(max_dt),
LowOutlier= (q1 - 1.5*IQR),
HighOutliet = (q3 + 1.5*IQR)
)
Approach:
Observations:
hub_pa_agg_mth %>% ungroup() %>% filter(CategoryName == "Programmed Activities") %>% filter (avg_cnt_mnth <= 10) %>% arrange(avg_cnt_mnth)
This sections is a more detailed look at the above, looking at the average per month based on PA types.
### Get a long table for all SRS for all Hubs
hub_pa_agg <- Datathon %>% filter(CategoryName == "Programmed Activities" & (ChildParticipants != 0 | AdultParticipants !=0 )) %>% group_by(HubRandomID, ShortName ) %>% summarise(
cnt = n(),
cnt_child = sum(ChildParticipants),
cnt_adult = sum(AdultParticipants),
avg_cnt_pa = ((cnt_child + cnt_adult) / cnt), # avg attendance per session
avg_child_pa = cnt_child/cnt, # average child attendance per session
avg_adult_pa = cnt_adult/cnt) # average adult attendance per session
# get min and max dates for each hub and join back to table
#hub_dts <- Datathon %>% group_by(HubRandomID) %>% summarise(min_dt = min(ActivityDate),
# max_dt = max(ActivityDate),
# duration = max_dt - min_dt
# )
# Join maxmin dates back to table:
hub_pa_agg <- hub_pa_agg %>% inner_join(hub_dts, by= c("HubRandomID" = "HubRandomID"))
# Develop new metrics - avg per month:
hub_pa_agg <- hub_pa_agg %>% mutate(duration_mnth = as.numeric((duration/365)*12),
avg_cnt_mnth = cnt/ duration_mnth
)
# Develop wide table with column for each SR based on avg cnt per month
hub_pa_agg_wide <- hub_pa_agg %>% select(HubRandomID, ShortName, min_dt, max_dt, duration_mnth,avg_cnt_mnth ) %>% pivot_wider(names_from = ShortName, values_from = c(avg_cnt_mnth), values_fill = list(cnt = 0))
hub_pa_agg %>% ggplot(aes( x= factor(ShortName), y=avg_cnt_mnth, fill = factor(ShortName))) +
coord_flip() +
theme(legend.position = "none") +
xlab("Program Activities") +
ylab("Average Per Month") +
ggtitle("Average Program Activities Per Month", subtitle = "Hub Distribution") +
#theme(axis.title.x=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
theme(
# axis.title.x=element_blank(),
#axis.text.y=element_blank(),
#axis.ticks.y=element_blank(),
legend.title =element_blank(),
plot.title = element_text(color="#D95F02", size=22, face="bold.italic"),
axis.title.x = element_text(color="#D53E4F", size=14, face="bold"),
axis.title.y = element_text(color="#D53E4F", size=14, face="bold"),
axis.text=element_text(size=16)
) +
#scale_x_discrete(expand = c(0,0)) +
#scale_y_discrete(expand = c(0,0)) +
scale_fill_brewer(palette = "Set1", type = "div", direction = 1) +
geom_boxplot(alpha =0.9)
options(repr.plot.width=30, repr.plot.height=10)
hub_pa_agg %>%
ggplot(aes(x=avg_cnt_mnth)) +
xlab("Average Count Per Month") +
ggtitle("Average PA Per Month", subtitle = "Hub Distribution")+
geom_dotplot(binpositions="all",stackgroups = TRUE,binwidth = 0.1, aes(fill =factor(ShortName)) )+
facet_grid(rows = vars(ShortName)) +
theme(legend.position = "none",
axis.text=element_text(size=16)
)
options(repr.plot.width= 30, repr.plot.height=12)
hub_pa_stats<- hub_pa_agg %>% group_by(ShortName) %>% summarise(min = min(avg_cnt_mnth),
q1 = quantile(avg_cnt_mnth,0.25),
mean = mean(avg_cnt_mnth),
median = median(avg_cnt_mnth),
q3 = quantile(avg_cnt_mnth,0.75),
max = max(avg_cnt_mnth),
sd = sd(avg_cnt_mnth),
IQR = IQR(avg_cnt_mnth),
latest_open_dt = max(min_dt),
latest_activity_dt = max(max_dt),
LowOutlier= (q1 - 1.5*IQR),
HighOutliet = (q3 + 1.5*IQR)
) %>% arrange(desc(IQR))
hub_pa_stats
Note: For these plots we excluded PA without attendees
There a some outlier hubs who run more than what is typical a month:
hub_pa_agg %>% filter(
(ShortName %in% c("Lifestyle","Playgroups", "Parents" ) & avg_cnt_mnth> 12 ) |
(ShortName == "ChildLit" & avg_cnt_mnth >7.3) |
(ShortName == "EnglishClasses" & avg_cnt_mnth > 8.8)
) %>% arrange(ShortName)
Datathon <- read.csv("~/Google Drive/Data Science/Datathon for Social Good/datathon-social-good-master/Datathon data/Internal data track/Datathon.csv")
### Get a long table for all PAs for all Hubs
hub_agg <- Datathon %>% filter(CategoryName == "Programmed Activities" & (ChildParticipants != 0 | AdultParticipants !=0 ))%>% group_by(HubRandomID) %>% summarise(
cnt = n(),
cnt_child = sum(ChildParticipants),
cnt_adult = sum(AdultParticipants),
avg_cnt = ((cnt_child + cnt_adult) / cnt), # avg attendance per session
avg_child = cnt_child/cnt, # average child attendance per session
avg_adult = cnt_adult/cnt) # average adult attendance per session
# get min and max dates for each hub and join back to table
#hub_dts <- Datathon %>% group_by(HubRandomID) %>% summarise(min_dt = min(ActivityDate),
# max_dt = max(ActivityDate),
# duration = max_dt - min_dt
# )
# Join maxmin dates back to table:
hub_agg <- hub_agg %>% inner_join(hub_dts, by= c("HubRandomID" = "HubRandomID"))
# Develop new metrics - avg per month:
hub_agg <- hub_agg %>% mutate(duration_mnth = as.numeric((duration/365)*12),
avg_cnt_mnth = cnt/ duration_mnth
)
# Develop wide table with column for each SR based on avg cnt per month
#hub_agg_wide <- hub_agg %>% select(HubRandomID, min_dt, max_dt, duration_mnth,avg_cnt_mnth ) %>% pivot_wider(names_from = ShortName, values_from = c(avg_cnt_mnth), values_fill = list(cnt = 0))
hub_agg %>% ggplot(aes( x= 1, y=avg_cnt)) +
coord_flip() +
theme(legend.position = "none") +
xlab("HubID") +
ylab("Average Attendance Per Session") +
ggtitle("Average Attendance Per Session", subtitle = "Hub Distribution") +
ylim(0, 35) +
#theme(axis.title.x=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
geom_boxplot()
options(repr.plot.width=30, repr.plot.height=5)
hub_agg %>%
ggplot(aes(x=avg_cnt)) +
xlab("Average Attendance per Hub") +
ggtitle("Average Attendance Per Hub", subtitle = "Hub Distribution")+
geom_dotplot(binpositions="all",stackgroups = TRUE,binwidth = 0.7, aes(fill = "light green") )+
theme(legend.position = "none",
axis.text=element_text(size=16)
)
options(repr.plot.width=30, repr.plot.height=6)
hub_agg_stats<- hub_agg %>% summarise(min = min(avg_cnt),
q1 = quantile(avg_cnt,0.25),
mean = mean(avg_cnt),
median = median(avg_cnt),
q3 = quantile(avg_cnt,0.75),
max = max(avg_cnt),
sd = sd(avg_cnt),
IQR = IQR(avg_cnt),
LowOutlier= (q1 - 1.5*IQR),
HighOutliet = (q3 + 1.5*IQR),
latest_open_dt = max(min_dt),
latest_activity_dt = max(max_dt),
LowOutlier= (q1 - 1.5*IQR),
HighOutliet = (q3 + 1.5*IQR)
)
hub_agg_stats
hub_agg %>% filter(avg_cnt >= 50)
hub_agg %>%
ggplot(aes(x=avg_child)) +
xlab("Average Attendance per Hub") +
ggtitle("Average Attendance Per Hub", subtitle = "Hub Distribution")+
geom_dotplot(binpositions="all",stackgroups = TRUE,binwidth = 0.7, aes(fill = "light green") )+
theme(legend.position = "none",
axis.text=element_text(size=16)
)
options(repr.plot.width=30, repr.plot.height=6)
hub_agg_stats<- hub_agg %>% summarise(min = min(avg_child),
q1 = quantile(avg_child,0.25),
mean = mean(avg_child),
median = median(avg_child),
q3 = quantile(avg_child,0.75),
max = max(avg_child),
sd = sd(avg_child),
IQR = IQR(avg_child),
LowOutlier= (q1 - 1.5*IQR),
HighOutliet = (q3 + 1.5*IQR),
latest_open_dt = max(min_dt),
latest_activity_dt = max(max_dt),
LowOutlier= (q1 - 1.5*IQR),
HighOutliet = (q3 + 1.5*IQR)
)
hub_agg_stats
hub_agg %>% filter(avg_child >= 32)
hub_agg %>%
ggplot(aes(x=avg_adult)) +
xlab("Average Attendance per Hub") +
ggtitle("Average Attendance Per Hub", subtitle = "Hub Distribution")+
geom_dotplot(binpositions="all",stackgroups = TRUE,binwidth = 0.5, aes(fill = "light green") )+
theme(legend.position = "none",
axis.text=element_text(size=16)
)
options(repr.plot.width=30, repr.plot.height=6)
hub_agg_stats<- hub_agg %>% summarise(min = min(avg_adult),
q1 = quantile(avg_adult,0.25),
mean = mean(avg_adult),
median = median(avg_adult),
q3 = quantile(avg_adult,0.75),
max = max(avg_adult),
sd = sd(avg_adult),
IQR = IQR(avg_adult),
LowOutlier= (q1 - 1.5*IQR),
HighOutliet = (q3 + 1.5*IQR),
latest_open_dt = max(min_dt),
latest_activity_dt = max(max_dt),
LowOutlier= (q1 - 1.5*IQR),
HighOutliet = (q3 + 1.5*IQR)
)
hub_agg_stats
hub_agg %>% filter(avg_adult >= 16)
Datathon <- read.csv("~/Google Drive/Data Science/Datathon for Social Good/datathon-social-good-master/Datathon data/Internal data track/Datathon.csv")
### Get a long table for all SRS for all Hubs
hub_pa_agg <- Datathon %>% filter(CategoryName == "Programmed Activities" & (ChildParticipants != 0 | AdultParticipants !=0 )) %>% group_by(HubRandomID, ShortName ) %>% summarise(
cnt = n(),
cnt_child = sum(ChildParticipants),
cnt_adult = sum(AdultParticipants),
avg_cnt_pa = ((cnt_child + cnt_adult) / cnt), # avg attendance per session
avg_child_pa = cnt_child/cnt, # average child attendance per session
avg_adult_pa = cnt_adult/cnt) # average adult attendance per session
# get min and max dates for each hub and join back to table
#hub_dts <- Datathon %>% group_by(HubRandomID) %>% summarise(min_dt = min(ActivityDate),
# max_dt = max(ActivityDate),
# duration = max_dt - min_dt
# )
# Join maxmin dates back to table:
hub_pa_agg <- hub_pa_agg %>% inner_join(hub_dts, by= c("HubRandomID" = "HubRandomID"))
# Develop new metrics - avg per month:
hub_pa_agg <- hub_pa_agg %>% mutate(duration_mnth = as.numeric((duration/365)*12),
avg_cnt_mnth = cnt/ duration_mnth
)
# Develop wide table with column for each SR based on avg cnt per month
hub_pa_agg_wide <- hub_pa_agg %>% select(HubRandomID, ShortName, min_dt, max_dt, duration_mnth,avg_cnt_mnth ) %>% pivot_wider(names_from = ShortName, values_from = c(avg_cnt_mnth), values_fill = list(cnt = 0))
hub_pa_agg %>% ggplot(aes( x= factor(ShortName), y=avg_cnt_pa, fill = factor(ShortName))) +
coord_flip() +
theme(legend.position = "none") +
xlab("Program Activities") +
ylab("Average Attendance Per PA") +
ggtitle("Average Attendance Per Program Activities", subtitle = "Hub Distribution") +
theme(axis.text=element_text(size=16)) +
#theme(axis.title.x=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
geom_boxplot()
options(repr.plot.width=30, repr.plot.height=15)
hub_pa_agg %>%
ggplot(aes(x=avg_cnt_pa)) +
xlab("Average Count Per Month") +
ggtitle("Average PA Per Month", subtitle = "Hub Distribution")+
geom_dotplot(binpositions="all",stackgroups = TRUE,binwidth = 1, aes(fill =factor(ShortName)) )+
facet_grid(rows = vars(ShortName)) +
theme(legend.position = "none",
axis.text=element_text(size=16)
)
options(repr.plot.width= 30, repr.plot.height=12)
hub_pa_stats<- hub_pa_agg %>% group_by(ShortName) %>% summarise(min = min(avg_cnt_pa),
q1 = quantile(avg_cnt_pa,0.25),
mean = mean(avg_cnt_pa),
median = median(avg_cnt_pa),
q3 = quantile(avg_cnt_pa,0.75),
max = max(avg_cnt_pa),
sd = sd(avg_cnt_pa),
IQR = IQR(avg_cnt_pa),
latest_open_dt = max(min_dt),
latest_activity_dt = max(max_dt),
LowOutlier= (q1 - 1.5*IQR),
HighOutliet = (q3 + 1.5*IQR)
) %>% arrange(desc(mean))
hub_pa_stats
Below are some outliers:
hub_pa_agg %>% filter(
(ShortName == "Playgroups" & avg_cnt_pa >45 |
ShortName == "Parents" & avg_cnt_pa >67 |
ShortName == "Lifestyle" & avg_cnt_pa >92 |
ShortName == "EnglishClasses" & avg_cnt_pa >24 |
ShortName == "ChildLit" & avg_cnt_pa >43
))%>% arrange(ShortName)
hub_pa_adult_stats<- hub_pa_agg %>% group_by(ShortName) %>% summarise(min = min(avg_adult_pa),
q1 = quantile(avg_adult_pa,0.25),
mean = mean(avg_adult_pa),
median = median(avg_adult_pa),
q3 = quantile(avg_adult_pa,0.75),
max = max(avg_adult_pa),
sd = sd(avg_adult_pa),
IQR = IQR(avg_adult_pa),
latest_open_dt = max(min_dt),
latest_activity_dt = max(max_dt),
LowOutlier= (q1 - 1.5*IQR),
HighOutliet = (q3 + 1.5*IQR)
) %>% arrange(desc(ShortName))
hub_pa_adult_stats
hub_pa_child_stats<- hub_pa_agg %>% group_by(ShortName) %>% summarise(min = min(avg_child_pa),
q1 = quantile(avg_child_pa,0.25),
mean = mean(avg_child_pa),
median = median(avg_child_pa),
q3 = quantile(avg_child_pa,0.75),
max = max(avg_child_pa),
sd = sd(avg_child_pa),
IQR = IQR(avg_child_pa),
latest_open_dt = max(min_dt),
latest_activity_dt = max(max_dt)
) %>% arrange(desc(ShortName))
hub_pa_child_stats
Datathon <- read.csv("~/Google Drive/Data Science/Datathon for Social Good/datathon-social-good-master/Datathon data/Internal data track/Datathon.csv")
### Convert Activity Date to Lubridate Date
Datathon$ActivityDate <- as.character(Datathon$ActivityDate)
Datathon$ActivityDate<- dmy(Datathon$ActivityDate)
### Get a long table for all SRS for all Hubs
hub_sr_agg <- Datathon %>%
#### Remove all Activities with 0 attendees unless they are Service Referrals:
filter(CategoryName == "Service Referrals") %>%
#### Remove all SR with 'Number of Families Participating'
filter(ShortName != "Number of families participating in the Hub" &
ShortName != 'Children'&
ShortName != 'Adults'
) %>%
group_by(HubRandomID, ShortName ) %>%
summarise(cnt = n())
# get min and max dates for each hub and join back to table
hub_dts <- Datathon %>% group_by(HubRandomID) %>% summarise(min_dt = min(ActivityDate),
max_dt = max(ActivityDate),
duration = max_dt - min_dt
)
# Join maxmin dates back to table:
hub_sr_agg <- hub_sr_agg %>% inner_join(hub_dts, by= c("HubRandomID" = "HubRandomID"))
# Develop new metrics - avg per month:
hub_sr_agg <- hub_sr_agg %>% mutate(duration_mnth = as.numeric((duration/365)*12),
avg_cnt_mnth = cnt/ duration_mnth
)
# Develop wide table with column for each SR based on avg cnt per month
hub_sr_agg_wide <- hub_sr_agg %>% select(HubRandomID, ShortName, min_dt, max_dt, duration_mnth,avg_cnt_mnth ) %>% pivot_wider(names_from = ShortName, values_from = c(avg_cnt_mnth), values_fill = list(cnt = 0))
relevant_srs_wide <- hub_sr_agg_wide
relevant_srs <- hub_sr_agg
colourCount = length(unique(relevant_srs $ShortName))
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
relevant_srs %>% ggplot(aes( x= factor(ShortName), y=avg_cnt_mnth, fill = factor(ShortName))) +
coord_flip() +
theme(legend.position = "none") +
theme(
# axis.title.x=element_blank(),
#axis.text.y=element_blank(),
#axis.ticks.y=element_blank(),
legend.title =element_blank(),
plot.title = element_text(color="#D95F02", size=22, face="bold.italic"),
axis.title.x = element_text(color="#D53E4F", size=14, face="bold"),
axis.title.y = element_text(color="#D53E4F", size=14, face="bold"),
axis.text=element_text(size=16)
) +
#scale_x_discrete(expand = c(0,0)) +
#scale_y_discrete(expand = c(0,0)) +
# scale_fill_brewer(colorRampPalette(brewer.pal(9, "Set1")), type = "div", direction = 1) +
geom_boxplot(alpha =1) +
xlab("Service Referral") +
ylab("Average Per Month") +
ggtitle("Average Service Referrals Per Month", subtitle = "Hub Distribution") +
theme(axis.title=element_text(size=12,face="bold"),
axis.text=element_text(size=16),
) +
#theme(axis.title.x=element_blank(), axis.text.y=element_blank(), axis.ticks.y=element_blank()) +
geom_boxplot(fill=getPalette(colourCount))
options(repr.plot.width=30, repr.plot.height=15)
relevant_srs %>%
ggplot(aes(x=avg_cnt_mnth)) +
xlab("Average SR Per Month") +
ggtitle("Average SR Per Month", subtitle = "Hub Distribution")+
geom_dotplot(binpositions="all",stackgroups = TRUE,binwidth = 0.02, aes(fill =factor(ShortName)) )+
facet_grid(rows = vars(ShortName)) +
theme(legend.position = "none",
axis.text=element_text(size=16),
)
options(repr.plot.width=30, repr.plot.height=15)
relevant_srs_stats<- relevant_srs %>% group_by(ShortName) %>% summarise(min = min(avg_cnt_mnth),
q1 = quantile(avg_cnt_mnth,0.25),
mean = mean(avg_cnt_mnth),
median = median(avg_cnt_mnth),
q3 = quantile(avg_cnt_mnth,0.75),
max = max(avg_cnt_mnth),
sd = sd(avg_cnt_mnth),
IQR = IQR(avg_cnt_mnth),
latest_open_dt = max(min_dt),
latest_activity_dt = max(max_dt),
LowOutlier= (q1 - 1.5*IQR),
HighOutliet = (q3 + 1.5*IQR)
) %>% arrange(desc(mean))
relevant_srs_stats
Datathon <- read.csv("~/Google Drive/Data Science/Datathon for Social Good/datathon-social-good-master/Datathon data/Internal data track/Datathon.csv")
Datathon$ActivityDate <- as.character(Datathon$ActivityDate)
Datathon$ActivityDate<- dmy(Datathon$ActivityDate)
### Get a long table for all SRS for all Hubs
hub_pa_agg_mth <- Datathon %>%
#### Remove all Activities with 0 attendees unless they are Service Referrals:
filter((CategoryName != "Service Referrals" & (ChildParticipants != 0 | AdultParticipants !=0 )) | CategoryName == "Service Referrals" ) %>%
#### Remove all SR with 'Number of Families Participating'
filter(ShortName != "Number of families participating in the Hub" &
ShortName != 'Children'&
ShortName != 'Adults'
) %>%
group_by(HubRandomID, CategoryName ) %>%
summarise(cnt = n())
# get min and max dates for each hub and join back to table
hub_dts <- Datathon %>% group_by(HubRandomID) %>% summarise(min_dt = min(ActivityDate),
max_dt = max(ActivityDate),
duration = max_dt - min_dt)
hub_pa_agg_mth <- hub_pa_agg_mth %>% inner_join(hub_dts, by= c("HubRandomID" = "HubRandomID"))
# Join maxmin dates back to table:
hub_pa_agg_mth <- hub_pa_agg_mth %>% mutate(duration_mnth = as.numeric((duration/365)*12),
avg_cnt_mnth = cnt/ duration_mnth
)
# Create a wide table useful for analysis
hub_pa_agg_mth_wide <- pivot_wider(hub_pa_agg_mth ,names_from = CategoryName, values_from = c(cnt, avg_cnt_mnth), values_fill = list(cnt = 0, avg_cnt_mnth =0))
### Rename columns:
hub_pa_agg_mth_wide <- hub_pa_agg_mth_wide %>% rename(
cnt_EAT = "cnt_Education and Training",
cnt_OOE = "cnt_One Off events",
cnt_PA = "cnt_Programmed Activities",
cnt_SR = "cnt_Service Referrals",
# avg_cnt_mth
avg_cnt_mth_EAT = "avg_cnt_mnth_Education and Training",
avg_cnt_mth_OOE = "avg_cnt_mnth_One Off events",
avg_cnt_mth_PA = "avg_cnt_mnth_Programmed Activities",
avg_cnt_mth_SR = "avg_cnt_mnth_Service Referrals"
) #For renaming tibble column using dplyrpipe
#operator
### Calculate create feature as proportion of month:
hub_pa_agg_mth_wide <- hub_pa_agg_mth_wide %>% mutate( total_avg_cnt_mth = avg_cnt_mth_EAT+avg_cnt_mth_OOE+avg_cnt_mth_PA+avg_cnt_mth_SR,
PA_avg_prop = avg_cnt_mth_PA / total_avg_cnt_mth,
EAT_avg_prop = avg_cnt_mth_EAT/ total_avg_cnt_mth,
OOE_avg_prop = avg_cnt_mth_OOE/ total_avg_cnt_mth,
SR_avg_prop = avg_cnt_mth_SR/ total_avg_cnt_mth,
SR_PA_ratio = avg_cnt_mth_SR/avg_cnt_mth_PA
)
colourCount = length(unique(hub_pa_agg_mth_wide$HubRandomID))
hub_pa_agg_mth_wide %>%
ggplot(aes(x=SR_PA_ratio)) +
xlab("Average Service Referrals per Program Activity") +
ylab("Proportion") +
ggtitle("Average Service Referrals per Program Activity across all hubs", subtitle = "Hub Distribution")+
geom_dotplot(binpositions="all",stackgroups = TRUE,binwidth = 0.08, fill=getPalette(colourCount))+
theme(legend.position = "none",
axis.text=element_text(size=16)
) +
theme(legend.position = "none") +
theme(
# axis.title.x=element_blank(),
#axis.text.y=element_blank(),
#axis.ticks.y=element_blank(),
legend.title =element_blank(),
plot.title = element_text(color="#7FC97F", size=22, face="bold.italic"),
axis.title.x = element_text(color="#D53E4F", size=14, face="bold"),
axis.title.y = element_text(color="#D53E4F", size=14, face="bold"),
axis.text=element_text(size=16)
)+
scale_y_continuous(NULL, breaks = NULL)
hub_pa_agg_mth_wide %>% ungroup() %>% summarise(min = min(SR_PA_ratio),
q1 = quantile(SR_PA_ratio,0.25),
mean = mean(SR_PA_ratio),
median = median(SR_PA_ratio),
q3 = quantile(SR_PA_ratio,0.75),
max = max(SR_PA_ratio),
sd = sd(SR_PA_ratio),
IQR = IQR(SR_PA_ratio),
latest_open_dt = max(min_dt),
latest_activity_dt = max(max_dt),
LowOutlier= (q1 - 1.5*IQR),
HighOutliet = (q3 + 1.5*IQR)
) %>% arrange(desc(IQR))
hub_pa_agg_mth_wide %>% filter (SR_PA_ratio >3) %>% select(-PA_avg_prop, -EAT_avg_prop)
Datathon <- read.csv("~/Google Drive/Data Science/Datathon for Social Good/datathon-social-good-master/Datathon data/Internal data track/Datathon.csv")
### Get a long table for all SRS for all Hubs
hub_pa_agg <- Datathon %>% filter(CategoryName == "Programmed Activities" & (ChildParticipants != 0 | AdultParticipants !=0 )) %>% group_by(HubRandomID, ShortName ) %>% summarise(
cnt = n(),
cnt_child = sum(ChildParticipants),
cnt_adult = sum(AdultParticipants),
avg_cnt_pa = ((cnt_child + cnt_adult) / cnt), # avg attendance per session
avg_child_pa = cnt_child/cnt, # average child attendance per session
avg_adult_pa = cnt_adult/cnt) # average adult attendance per session
# get min and max dates for each hub and join back to table
#hub_dts <- Datathon %>% group_by(HubRandomID) %>% summarise(min_dt = min(ActivityDate),
# max_dt = max(ActivityDate),
# duration = max_dt - min_dt
# )
# Join maxmin dates back to table:
hub_pa_agg <- hub_pa_agg %>% inner_join(hub_dts, by= c("HubRandomID" = "HubRandomID"))
# Develop new metrics - avg per month:
hub_pa_agg <- hub_pa_agg %>% mutate(duration_mnth = as.numeric((duration/365)*12),
avg_cnt_mnth = cnt/ duration_mnth
)
# Develop wide table with column for each SR based on avg cnt per month
hub_pa_agg_wide <- hub_pa_agg %>% select(HubRandomID, ShortName, min_dt, max_dt, duration_mnth,avg_cnt_mnth ) %>% pivot_wider(names_from = ShortName, values_from = c(avg_cnt_mnth), values_fill = list(avg_cnt_mnth = 0))
data_clust <- hub_pa_agg_wide %>% ungroup() %>%select(-HubRandomID,-min_dt,-max_dt, -duration_mnth) ### Columns to exclude
row.names(data_clust) <- hub_pa_agg_wide$HubRandomID #### Row names for the HCLUST
data_clust_dist <- dist(data_clust) #### This creates a distance matrix from each column
cluster <- hclust(data_clust_dist) ##### This is the command to create the heirchical cluster
dend <- as.dendrogram(cluster) #### This is the function to create the dendrogram to display different style diagram
#### Several different ways to present the heirarchical diagram:
#plot(cluster, main= "Program Activity Types Per Month Similarity Clustering", cex=1, hang = -1, axes = FALSE)
#plot(dend, main= "MSO vs Condition Clustering", cex=0.9, hang = -1, axes = FALSE)
#plot(dend, hang = -1, type = "triangle")
hub_pa_agg_wide [cluster$order,]
five_a<- hub_pa_agg_wide [cluster$order,]
write.table(five_a, "five_a.xls", sep="|")
plot(cluster, main= "Program Activity Types Per Month Similarity Clustering", cex=1, hang = -1, axes = FALSE)
Datathon <- read.csv("~/Google Drive/Data Science/Datathon for Social Good/datathon-social-good-master/Datathon data/Internal data track/Datathon.csv")
### Get a long table for all SRS for all Hubs
hub_pa_agg <- Datathon %>% filter(CategoryName == "Programmed Activities" & (ChildParticipants != 0 | AdultParticipants !=0 )) %>% group_by(HubRandomID, ShortName ) %>% summarise(
cnt = n(),
cnt_child = sum(ChildParticipants),
cnt_adult = sum(AdultParticipants),
avg_cnt_pa = ((cnt_child + cnt_adult) / cnt), # avg attendance per session
avg_child_pa = cnt_child/cnt, # average child attendance per session
avg_adult_pa = cnt_adult/cnt) # average adult attendance per session
# get min and max dates for each hub and join back to table
#hub_dts <- Datathon %>% group_by(HubRandomID) %>% summarise(min_dt = min(ActivityDate),
# max_dt = max(ActivityDate),
# duration = max_dt - min_dt
# )
# Join maxmin dates back to table:
hub_pa_agg <- hub_pa_agg %>% inner_join(hub_dts, by= c("HubRandomID" = "HubRandomID"))
# Develop new metrics - avg per month:
#hub_pa_agg <- hub_pa_agg %>% mutate(duration_mnth = as.numeric((duration/365)*12),
# avg_cnt_mnth = cnt/ duration_mnth
# )
# Develop wide table with column for each SR based on avg cnt per month
hub_pa_agg_wide <- hub_pa_agg %>% select(HubRandomID, ShortName,avg_cnt_pa) %>% pivot_wider(names_from = ShortName, values_from = c(avg_cnt_pa), values_fill = list( avg_cnt_pa = 0))
data_clust <- hub_pa_agg_wide %>% ungroup() %>%select(-HubRandomID) ### Columns to exclude
row.names(data_clust) <- hub_pa_agg_wide$HubRandomID #### Row names for the HCLUST
data_clust_dist <- dist(data_clust) #### This creates a distance matrix from each column
cluster <- hclust(data_clust_dist) ##### This is the command to create the heirchical cluster
dend <- as.dendrogram(cluster) #### This is the function to create the dendrogram to display different style diagram
#### Several different ways to present the heirarchical diagram:
plot(cluster, main= "Program Activity Types Per Month Similarity Clustering", cex=1, hang = -1, axes = FALSE)
#plot(dend, main= "MSO vs Condition Clustering", cex=0.9, hang = -1, axes = FALSE)
#plot(dend, hang = -1, type = "triangle")
hub_pa_agg_wide [cluster$order,]
five_b<- hub_pa_agg_wide [cluster$order,]
#write.table(five_b, "five_b.xls", sep="|")